Reference

Dataset: Suvorov, Anton; Kim, Bernard; Wang, Jeremy; Armstrong, Ellie; Peede, David; D’Agostino, Emmanuel R. R.; et al. (2020). Widespread introgression across a phylogeny of 155 Drosophila genomes. figshare. Dataset. https://doi.org/10.6084/m9.figshare.13264697.v1

Paper: Suvorov A, Kim BY, Wang J, Armstrong EE, Peede D, D’Agostino ERR, Price DK, Waddell P, Lang M, Courtier-Orgogozo V, David JR, Petrov D, Matute DR, Schrider DR, Comeault AA. Widespread introgression across a phylogeny of 155 Drosophila genomes. Curr Biol. 2022 Jan 10;32(1):111-123.e5. doi: 10.1016/j.cub.2021.10.052. Epub 2021 Nov 16. PMID: 34788634; PMCID: PMC8752469.

Target alignment

Name: EOG09150AOE

Outgroup: Anopheles

Type: Aligned DNA BUSCO loci

library(Biostrings)
sequences <- readDNAStringSet('Data/mixture_class_single_gene/EOG09150A0E.fna.aln')

Alignment summaries

# model
one_line <- readLines('Data/one_class_single_gene/one_class_EOG09150A0E.iqtree')
mix_line <- readLines("Data/mixture_class_single_gene/mix_class_EOG09150A0E.iqtree")
library(data.table)
align_line <- grep("SEQUENCE ALIGNMENT", one_line, value = TRUE)
align_sum <- one_line[(which(one_line == align_line)+2):(which(one_line == align_line)+8)]
align_sum_table <- data.table(Line = align_sum)
setnames(align_sum_table, new = "Alignment summaries")
print(align_sum_table)
##                                                                           Alignment summaries
## 1:                                                                                           
## 2:                                       Input data: 155 sequences with 1957 nucleotide sites
## 3:                                   Number of constant sites: 1022 (= 52.2228% of all sites)
## 4: Number of invariant (constant or ambiguous constant) sites: 1022 (= 52.2228% of all sites)
## 5:                                                 Number of parsimony informative sites: 552
## 6:                                                      Number of distinct site patterns: 930
## 7:

Command of model

# Command of run one class model iq-tree
iqtree_one_model_command <- '% /Users/lindsaybai/Desktop/BIOL8706/iqtree-2.2.2.7.modelmix-MacOSX/bin/iqtree2 -s /Users/lindsaybai/Desktop/BIOL8706/one_gene/nad1alignment.fasta   -B 1000 -T AUTO'

# Command of run mixture class model iq-tree
iqtree_mixture_command <- '/Users/lindsaybai/Desktop/BIOL8706/iqtree-2.2.2.7.modelmix-MacOSX/bin/iqtree2  -s /Users/lindsaybai/Desktop/BIOL8706/one_gene_mixture/nad1alignment.fasta   -m "ESTMIXNUM" -mrate E,I,G,I+G,R,I+R -opt_qmix_criteria 1'

system(iqtree_one_model_command)
system(iqtree_mixture_command)

Tree Topology

library(ape)
library(phytools)
# Read the tree files
Mixture <- read.tree("Data/mixture_class_single_gene/mix_class_EOG09150A0E.treefile")
One_class <- read.tree("Data/one_class_single_gene/one_class_EOG09150A0E.treefile")
Species <- read.tree("Data/mixture_class_single_gene/EOG09150A0E.fna.aln.tr.treefile")

Phylograms plot

par(mfrow = c(1, 2))  # Set the plotting layout to 1 row and 2 columns
plot(One_class, main = "One class model tree")  # Plot the first tree
plot(Mixture, main = "Mixture class model tree")  # Plot the second tree

Cophylogeny plot

## create co-phylogenetic object
wasp.cophylo<-cophylo(Mixture,One_class)
## Rotating nodes to optimize matching...
## Done.
## plot co-phylogenies
plot(wasp.cophylo,link.type="curved",link.lwd=4,
 link.lty="solid",link.col=make.transparent("red",
 0.25))

par(mar=c(5.1,4.1,4.1,2.1))

Phylogenies inferred using these 3 approaches only differed in 2 trees:

  1. D watanabei D punjabiensis was either have paraphyletic relationships to D. kikkawai and D. leontia or have paraphyletic relationships with D. seguy, D. nikananu, D. vulcana, D spaffchauvacae, D bocquet, D burlai, D. jambulina, D. bakoue

  2. D wassermani form monophyletic lineage sister to the D. acanthoptera or have paraphyletic relationships where D pachea is sister to the D. acanthoptera

  3. D paucipunta form monophyletic lineage sister to the D prolacticillia or have paraphyletic relationships with the D prolacticillia

Table of topological distance metrics

# Load required packages
library(ape)
library(phangorn)

# Placeholder data (replace these with your actual data)
gene_names <- c("EOG09150AOE")
tree_names <- c("One_class", "Mixture", "Species")
tree_files <- list(
  Mixture = read.tree("Data/mixture_class_single_gene/mix_class_EOG09150A0E.treefile"),
  One_class = read.tree("Data/one_class_single_gene/one_class_EOG09150A0E.treefile"),
  Species = read.tree("Data/mixture_class_single_gene/EOG09150A0E.fna.aln.tr.treefile")
)

# Create an empty data frame to store the results
result_df <- data.frame(
  metric = character(0),
  tree1_name = character(0),
  tree2_name = character(0),
  gene_name = character(0),
  RF_distance = numeric(0),
  nRF_distance = numeric(0),
  wRF_distance = numeric(0),
  KF_distance = numeric(0),
  PD_distance = numeric(0),
  wPD_distance = numeric(0)
)

# Loop through the combinations of gene names and tree pairs
for (gene in gene_names) {
    tree_combinations <- combn(length(tree_files), 2, simplify = FALSE)
    for (comb in tree_combinations) {
        i <- comb[1]
        j <- comb[2]
        
        tree1 <- tree_files[[i]]
        tree2 <- tree_files[[j]]
        tree1_name <- tree_names[i]
        tree2_name <- tree_names[j]
      
       # Placeholder for distance calculation (replace with actual distance calculation function)
        RF_dist <- RF.dist(tree1, tree2, normalize = FALSE, check.labels = TRUE, rooted = FALSE)
        nRF_dist <-RF.dist(tree1, tree2, normalize = TRUE, check.labels = TRUE, rooted = FALSE)
        wRF_dist <- wRF.dist(tree1, tree2, normalize = FALSE, check.labels = TRUE, rooted = FALSE)
        KF_dist <- KF.dist(tree1, tree2, check.labels = TRUE, rooted = FALSE)
        PD_dist <- path.dist(tree1, tree2, check.labels = TRUE, use.weight = FALSE)
        wPD_dist <- path.dist(tree1, tree2, check.labels = TRUE, use.weight = TRUE)

      
      # Add the results to the data frame
      result_df <- rbind(result_df, data.frame(
        gene_name = gene,
        tree1_name = tree1_name,
        tree2_name = tree2_name,
        RF_distance = RF_dist,
        nRF_distance = nRF_dist,
        wRF_distance = wRF_dist,
        KF_distance = KF_dist,
        PD_distance = PD_dist,
        wPD_distance = wPD_dist
      ))
    }
}

# Print the resulting data frame
print(result_df)
##     gene_name tree1_name tree2_name RF_distance nRF_distance wRF_distance
## 1 EOG09150AOE  One_class    Mixture          14   0.04605263    1.8220541
## 2 EOG09150AOE  One_class    Species          22   0.07236842    1.3121051
## 3 EOG09150AOE    Mixture    Species          14   0.04605263    0.6060271
##   KF_distance PD_distance wPD_distance
## 1   0.6735708    91.94564    10.177523
## 2   0.4911228   105.19506     7.137974
## 3   0.1920068    62.33779     3.317044

Branch Lengths

Summary parameters table

# tree length & Sum_int & prop_int
one_length <- sub("Total tree length \\(sum of branch lengths\\): ","\\1",grep("Total tree length \\(sum of branch lengths\\): ", one_line, value = TRUE))
mix_length <- sub("Total tree length \\(sum of branch lengths\\): ","\\1",grep("Total tree length \\(sum of branch lengths\\): ", mix_line, value = TRUE))

one_sum_int <- sub("Sum of internal branch lengths: ([0-9.]+).*","\\1",grep("Sum of internal branch lengths: ([0-9.]+).*", one_line, value = TRUE))
mix_sum_int <- sub("Sum of internal branch lengths: ([0-9.]+).*","\\1",grep("Sum of internal branch lengths: ([0-9.]+).*", mix_line, value = TRUE))

one_prop_int <- sub(" of tree length", "", sub(".*\\(([^)]+)\\).*","\\1",grep("Sum of internal branch lengths: ([0-9.]+).*", one_line, value = TRUE)))
mix_prop_int <- sub(" of tree length", "", sub(".*\\(([^)]+)\\).*","\\1",grep("Sum of internal branch lengths: ([0-9.]+).*", mix_line, value = TRUE)))
# Provided Newick tree string
Mixture_txt <- readLines("Data/mixture_class_single_gene/mix_class_EOG09150A0E.treefile")
One_class_txt <- readLines("Data/one_class_single_gene/one_class_EOG09150A0E.treefile")

branch_len_mix <- as.numeric(gsub("([0-9.]+).*", "\\1", strsplit(Mixture_txt, ":")[[1]])[-1])
branch_len_one <- as.numeric(gsub("([0-9.]+).*", "\\1", strsplit(One_class_txt, ":")[[1]])[-1])

sum_branch_len_one <- summary(branch_len_one)
sum_branch_len_mix <- summary(branch_len_mix)
# Create a dataframe with the provided summary statistics
df <- data.frame(
  Tree_Length = c(one_length, mix_length),
  Sum_int = c(one_sum_int, mix_sum_int),
  prop_int = c(one_prop_int, mix_prop_int),
  min = c(sum_branch_len_one[1], sum_branch_len_mix[1]),
  Qu_1st = c(sum_branch_len_one[2], sum_branch_len_mix[2]),
  Median = c(sum_branch_len_one[3], sum_branch_len_mix[3]),
  Mean = c(sum_branch_len_one[4], sum_branch_len_mix[4]),
  Qu_3rd = c(sum_branch_len_one[5], sum_branch_len_mix[5]),
  Max = c(sum_branch_len_one[6], sum_branch_len_mix[6]),
  gene_name = rep(gene_names,2),
  row.names = c("One model", "Mixture model"))
# Print the dataframe
print(df)
##               Tree_Length Sum_int prop_int        min      Qu_1st      Median
## One model          9.7422  2.5724 26.4050% 1.0415e-06 0.004217371 0.009888112
## Mixture model     11.4917  2.8861 25.1151% 1.0112e-06 0.004586520 0.010936734
##                     Mean     Qu_3rd      Max   gene_name
## One model     0.03173342 0.02522135 1.965171 EOG09150AOE
## Mixture model 0.03743218 0.02791189 2.604123 EOG09150AOE

Faceted histogram

# Assuming you have loaded the 'ggplot2' package and have the necessary data

data <- data.frame(
  model = rep(c("One class", "Mixture"), each = length(branch_len_one)),
  branch_length = c(branch_len_one, branch_len_mix)
)

# Create a faceted histogram
library("ggplot2")
ggplot(data, aes(x = branch_length)) +
  geom_histogram(binwidth = 0.1, fill = "blue", color = "black") +
  facet_grid(model ~ ., scales = "free_y") +  # Facet by 'model', free y-axis scales
  labs(x = "Branch Length", y = "Frequency") +
  theme_minimal()

ECDF plot

# Create an ECDF plot
ggplot(data, aes(x = branch_length, color = model)) +
  stat_ecdf(geom = "step") +
  labs(x = "Branch Length", y = "ECDF") +
  scale_color_manual(values = c("One class" = "blue", "Mixture" = "red")) +
  theme_minimal() +
  theme(legend.position = "top")

Models

Description

  1. One Class Model
library(data.table)
library(stringr)
one_model <- grep("^Model of substitution:", one_line, value = TRUE)
one_class_mod_lines <- one_line[(which(one_line == one_model)):(which(one_line == one_model)+9)]
one_class_mod_table <- data.table(Line = one_class_mod_lines)
setnames(one_class_mod_table, new = "One Class Model")
one_class_mod_table
##                       One Class Model
##  1: Model of substitution: TIM2e+I+R4
##  2:                                  
##  3:                 Rate parameter R:
##  4:                                  
##  5:                       A-C: 1.3727
##  6:                       A-G: 2.7277
##  7:                       A-T: 1.3727
##  8:                       C-G: 1.0000
##  9:                       C-T: 4.7261
## 10:                       G-T: 1.0000
mix_model <- grep("^Mixture model of substitution:", mix_line, value = TRUE)
mix_df1 <- data.table(Line = mix_model)
setnames(mix_df1, new = "Mix Class Model")
print(mix_df1)
##                                               Mix Class Model
## 1: Mixture model of substitution: MIX{TIM2+FO,JC,HKY+FO}+I+G4
  1. Mix Class Model
# Example lines
Component_mix_model <- mix_line[(which(mix_line == mix_model)+3):(which(mix_line == mix_model)+6)]
# Initialize an empty vector to store words
all_words <- c()
# Iterate through each line
for (line in Component_mix_model) {
words <- strsplit(line, " ")[[1]]
      non_empty_words <- words[words != ""]
      all_words <- c(all_words, non_empty_words)
    }

    matrix_data <- matrix(all_words,
                       nrow = 3, ncol = 5, byrow = TRUE)
# Convert the matrix into a data frame
mix_df2 <- as.data.frame(matrix_data)

# Add row and column names
colnames(mix_df2) <- c("No", "Component", "Rate", "Weight", "Parameters")

combined_mod_mix_df <- rbind(mix_df1, mix_df2,fill = TRUE)
combined_mod_mix_df
##                                               Mix Class Model   No Component
## 1: Mixture model of substitution: MIX{TIM2+FO,JC,HKY+FO}+I+G4 <NA>      <NA>
## 2:                                                       <NA>    1      TIM2
## 3:                                                       <NA>    2        JC
## 4:                                                       <NA>    3       HKY
##      Rate Weight
## 1:   <NA>   <NA>
## 2: 1.0000 0.4908
## 3: 1.0000 0.3430
## 4: 1.0000 0.1662
##                                                              Parameters
## 1:                                                                 <NA>
## 2: TIM2{1.98418,8.94168,15.2375}+FO{0.17763,0.233357,0.305017,0.283996}
## 3:                                                                   JC
## 4:                HKY{2.99314}+FO{0.375944,0.415568,0.0779491,0.130539}

Summary table

one_model <- sub("^Model of substitution: ","\\1",grep("^Model of substitution:", one_line, value = TRUE))
mix_model <- sub("^Mixture model of substitution:", "\\1", mix_model <- grep("^Mixture model of substitution:", mix_line, value = TRUE))
# Rates, Likelihood, Unconstrained_likelihood
one_rates <- sub("Model of rate heterogeneity: ","\\1",grep("Model of rate heterogeneity: ", one_line, value = TRUE)) 
mix_rates <-sub("Model of rate heterogeneity: ","\\1",grep("Model of rate heterogeneity: ", mix_line, value = TRUE)) 
one_likelihood <- sub("Log-likelihood of the tree: " ,"\\1",grep("Log-likelihood of the tree: ", one_line, value = TRUE)) 
mix_likelihood <- sub("Log-likelihood of the tree: " ,"\\1",grep("Log-likelihood of the tree: ", mix_line, value = TRUE)) 

one_uncons_likelihood <- sub("Unconstrained log-likelihood \\(without tree\\):","\\1",grep("Unconstrained log-likelihood \\(without tree\\):", one_line, value = TRUE)) 
mix_uncons_likelihood <- sub("Unconstrained log-likelihood \\(without tree\\):","\\1",grep("Unconstrained log-likelihood \\(without tree\\):", mix_line, value = TRUE)) 
# parameters
one_para <- sub("Number of free parameters \\(#branches \\+ #model parameters\\): ","\\1",grep("Number of free parameters \\(#branches \\+ #model parameters\\): ", one_line, value = TRUE))
mix_para <- sub("Number of free parameters \\(#branches \\+ #model parameters\\): ","\\1",grep("Number of free parameters \\(#branches \\+ #model parameters\\): ", mix_line, value = TRUE))
# AIC,AICc, BIC,
one_AIC <- sub("Akaike information criterion \\(AIC\\) score: ","\\1",grep("Akaike information criterion \\(AIC\\) score: ", one_line, value = TRUE))
mix_AIC <- sub("Akaike information criterion \\(AIC\\) score: ","\\1",grep("Akaike information criterion \\(AIC\\) score: ", mix_line, value = TRUE))

one_AICC <- sub("Corrected Akaike information criterion \\(AICc\\) score: ","\\1",grep("Corrected Akaike information criterion \\(AICc\\) score: ", one_line, value = TRUE))
mix_AICC <- sub("Corrected Akaike information criterion \\(AICc\\) score: ","\\1",grep("Corrected Akaike information criterion \\(AICc\\) score: ", mix_line, value = TRUE))

one_BIC <- sub("Bayesian information criterion \\(BIC\\) score: ","\\1",grep("Bayesian information criterion \\(BIC\\) score: ", one_line, value = TRUE))
mix_BIC <- sub("Bayesian information criterion \\(BIC\\) score: ","\\1",grep("Bayesian information criterion \\(BIC\\) score: ", mix_line, value = TRUE))

if (one_BIC < mix_BIC) {
  best_model <- "One"
} else if (mix_BIC < one_BIC) {
  best_model <- "Mixture"
} else {
  best_model <- "Both models have the same BIC score"
}
# Create the data frame
model_data <- data.frame(
  row.names = c("One model", "Mixture model"),
  Classes = c(1, 3),
  Model_String = c(one_model, mix_model),
  Rates = c(one_rates, mix_rates),
  Likelihood = c(one_likelihood,mix_likelihood),
  Unconstrained_likelihood = c(one_uncons_likelihood,mix_uncons_likelihood),
  parameters = c(one_para,mix_para),
  AIC = c(one_AIC,mix_AIC),
  AICc = c(one_AICC,mix_AICC),
  BIC = c(one_BIC, mix_BIC),
  Best = rep(best_model,2),
  gene_name = rep(gene_names,2)
)

# Print the table
print(model_data)
##               Classes                 Model_String
## One model           1                   TIM2e+I+R4
## Mixture model       3  MIX{TIM2+FO,JC,HKY+FO}+I+G4
##                                          Rates                  Likelihood
## One model     Invar+FreeRate with 4 categories -24722.0640 (s.e. 872.7689)
## Mixture model    Invar+Gamma with 4 categories -24506.9433 (s.e. 863.3187)
##               Unconstrained_likelihood parameters        AIC       AICc
## One model                  -11045.4974        317 50078.1279 50201.1371
## Mixture model              -11045.4974        321 49655.8866 49782.3233
##                      BIC    Best   gene_name
## One model     51846.7242 Mixture EOG09150AOE
## Mixture model 51446.7995 Mixture EOG09150AOE